# Load the raw data file.
data <- read.csv("../data/filledDatabase111119NUMONLY.csv")
# Clean the raw table, then collapse it (both helpers are defined outside this
# section).
data <- data %>%
  clean_data() %>%
  collapse_data()

# Keep the compound identifiers and the group categories aside from the
# predictors.
compound <- data$Compound
group_cat <- data$GroupCat

# Drop the identifier columns, then build the data set constructed from the
# first 13 PCs. The response is excluded by name rather than by position so the
# result does not depend on GroupCat being the first column.
data <- select(data, -c("Compound", "X"))
data_pca <- data[, names(data) != "GroupCat", drop = FALSE] %>%
  get_pc_space(k = 13) %>%
  scale() %>%
  data.frame()

# Split the rows into 5 folds for cross-validation later. seq_len() is used
# instead of 1:nrow() so an empty table yields an empty index vector.
# NOTE(review): fold assignment is random; confirm a seed is set earlier in the
# document if the reported results need to be reproducible.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5, list = TRUE, returnTrain = FALSE
)

Multinomial Regression

library(glmnet)
# Predictor matrix: every column except the response, selected by name so the
# result does not depend on GroupCat being the first column.
X <- as.matrix(data[, names(data) != "GroupCat", drop = FALSE])
# Response (class labels) kept as a one-column matrix.
Y <- as.matrix(data$GroupCat)

Shrinkage

Ridge

# Ridge penalty (alpha = 0): fit the multinomial regularization path and plot
# the coefficient traces along the lambda sequence.
model_ridge <- glmnet(X, Y, family = "multinomial", alpha = 0)
plot(model_ridge, label = TRUE, xvar = "lambda")

LASSO

# LASSO penalty (alpha = 1): fit the multinomial regularization path and plot
# the coefficient traces along the lambda sequence.
model_lasso <- glmnet(X, Y, family = "multinomial", alpha = 1)
plot(model_lasso, label = TRUE, xvar = "lambda")

Coefficients

Ridge

# 5-fold cross-validated ridge fit scored by deviance; the coefficients at the
# lambda with the minimum CV error are then handed to plot_coef().
ridge_cv <- cv.glmnet(
  X, Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
plot_coef(get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min))

LASSO

# 5-fold cross-validated LASSO fit scored by deviance; the coefficients at the
# lambda with the minimum CV error are then handed to plot_coef().
lasso_cv <- cv.glmnet(
  X, Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
plot_coef(get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min))

Elastic Net

library(caret)
# Tune the glmnet model with caret using 5-fold CV and tuneLength = 10, then
# hand the final model's coefficients at the selected lambda to plot_coef().
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 5)
)
plot_coef(
  get_coef(
    elastic_cv$finalModel,
    tuning_parameter = elastic_cv$bestTune$lambda
  )
)

Classification accuracy rate

Ridge

# Prediction results for the ridge model at its CV-selected lambda; the `r`
# element is printed as the per-fold accuracy table.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
print_accurate_tb(tb_ridge$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.6477273 0.625 0.6741573 0.5340909 0.6704545 0.630286
# Render the `t` element of the ridge results as a table of counts.
highlight_tb_count(tb_ridge$t)
16 3 5 6 Other
16 68 12 9 2 20
3 15 111 34 9 6
5 10 18 66 0 5
6 8 2 0 28 0
Other 6 1 6 0 5
Total 107 144 115 39 36
# Render the `t` element of the ridge results as a table of proportions.
highlight_tb_percent(tb_ridge$t)
16 3 5 6 Other
16 0.64 0.08 0.08 0.05 0.56
3 0.14 0.77 0.3 0.23 0.17
5 0.09 0.12 0.57 0 0.14
6 0.07 0.01 0 0.72 0
Other 0.06 0.01 0.05 0 0.14
Total 100% 100% 100% 100% 100%

LASSO

# Prediction results for the LASSO model at its CV-selected lambda; the `r`
# element is printed as the per-fold accuracy table.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
print_accurate_tb(tb_lasso$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.6590909 0.6136364 0.6741573 0.5340909 0.6590909 0.6280133
# Render the `t` element of the LASSO results as a table of counts.
highlight_tb_count(tb_lasso$t)
16 3 5 6 Other
16 66 9 11 2 18
3 14 111 33 9 7
5 10 20 67 0 5
6 9 2 0 28 1
Other 8 2 4 0 5
Total 107 144 115 39 36
# Render the `t` element of the LASSO results as a table of proportions.
highlight_tb_percent(tb_lasso$t)
16 3 5 6 Other
16 0.62 0.06 0.1 0.05 0.5
3 0.13 0.77 0.29 0.23 0.19
5 0.09 0.14 0.58 0 0.14
6 0.08 0.01 0 0.72 0.03
Other 0.07 0.01 0.03 0 0.14
Total 100% 100% 100% 100% 100%

Elastic Net

# Prediction results for the elastic net at the alpha/lambda pair selected by
# caret. The tuning values are read by name (matching the `bestTune$lambda`
# access used when plotting the coefficients) instead of by column position.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# The `r` element is printed as the per-fold accuracy table.
print_accurate_tb(tb_elastic$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.6363636 0.6363636 0.7078652 0.5681818 0.625 0.6347549
# Render the `t` element of the elastic net results as a table of counts.
highlight_tb_count(tb_elastic$t)
16 3 5 6 Other
16 66 9 10 2 18
3 14 109 32 8 6
5 11 20 68 0 3
6 7 3 0 29 1
Other 9 3 5 0 8
Total 107 144 115 39 36
# Render the `t` element of the elastic net results as a table of proportions.
highlight_tb_percent(tb_elastic$t)
16 3 5 6 Other
16 0.62 0.06 0.09 0.05 0.5
3 0.13 0.76 0.28 0.21 0.17
5 0.1 0.14 0.59 0 0.08
6 0.07 0.02 0 0.74 0.03
Other 0.08 0.02 0.04 0 0.22
Total 100% 100% 100% 100% 100%